EDA
cormat <- winequality_red %>% select_if(is.numeric) %>% cor()
cormat <- round(cormat ,2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
melted_cormat <- melt(cormat)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill = value))+
geom_tile(color = "white")+
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4)

library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
winequality_red %>% filter(quality_level %in% c("high_quality", "low_quality")) %>% plot_ly(x = .$pH, y = .$alcohol, z = .$`residual sugar`, type="scatter3d", mode="markers", color = .$quality_level)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
PCA
wine_pca <- winequality_red %>% select(where(is.numeric)) %>% # retain only numeric columns
prcomp(scale = TRUE)
wine_pca %>%
augment(winequality_red) %>% # add original dataset back in
ggplot(aes(.fittedPC1, .fittedPC2, color = quality)) +
geom_point(size = 1.5)

wine_pca %>%
tidy(matrix = "eigenvalues") %>%
ggplot(aes(PC, percent)) +
geom_col(fill = "#56B4E9", alpha = 0.8) +
scale_x_continuous(breaks = 1:9) +
scale_y_continuous(
labels = scales::percent_format(),
expand = expansion(mult = c(0, 0.01))
)

# define arrow style for plotting
arrow_style <- arrow(
angle = 20, ends = "first", type = "closed", length = grid::unit(8, "pt")
)
# plot rotation matrix
wine_pca %>%
tidy(matrix = "rotation") %>%
pivot_wider(names_from = "PC", names_prefix = "PC", values_from = "value") %>%
ggplot(aes(PC1, PC2)) +
geom_segment(xend = 0, yend = 0, arrow = arrow_style) +
geom_text(
aes(label = column),
hjust = 1, nudge_x = -0.02,
color = "#904C2F"
) +
xlim(-1.25, .5) + ylim(-.5, 1) +
coord_fixed() + # fix aspect ratio to 1:1
theme_minimal_grid(12)
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_text).

points <- winequality_red %>% select(where(is.numeric))
kclusts <-
tibble(k = 1:9) %>%
mutate(
kclust = map(k, ~kmeans(points, .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, points)
)
clusters <-
kclusts %>%
unnest(cols = c(tidied))
assignments <-
kclusts %>%
unnest(cols = c(augmented))
clusterings <-
kclusts %>%
unnest(cols = c(glanced))
p1 <-
ggplot(assignments, aes(x = assignments$pH, y = assignments$chlorides)) +
geom_point(aes(color = .cluster), alpha = 0.8) +
facet_wrap(~ k)
p1
## Warning: Use of `assignments$pH` is discouraged. Use `pH` instead.
## Warning: Use of `assignments$chlorides` is discouraged. Use `chlorides` instead.

Tidy way
library(tidymodels)
winequality_red_tidy <- winequality_red %>% select(where(is.numeric), quality)
pca_wine <- recipe( quality ~., data = winequality_red_tidy) %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors())
pca_prep <- prep(pca_wine)
pca_prep
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 10
##
## Training data contained 1599 data points and no missing data.
##
## Operations:
##
## Centering and scaling for fixed acidity, volatile acidity, ... [trained]
## PCA extraction with fixed acidity, volatile acidity, ... [trained]
tidied_pca <- tidy(pca_prep, 2)
tidied_pca %>%
filter(component %in% paste0("PC", 1:5)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(value, terms, fill = terms)) +
geom_col(show.legend = FALSE) +
facet_wrap(~component, nrow = 1) +
labs(y = NULL)

library(tidytext)
tidied_pca %>%
filter(component %in% paste0("PC", 1:4)) %>%
group_by(component) %>%
top_n(8, abs(value)) %>%
ungroup() %>%
mutate(terms = reorder_within(terms, abs(value), component)) %>%
ggplot(aes(abs(value), terms, fill = value > 0)) +
geom_col() +
facet_wrap(~component, scales = "free_y") +
scale_y_reordered() +
labs(
x = "Absolute value of contribution",
y = NULL, fill = "Positive?"
)

juice(pca_prep) %>%
ggplot(aes(PC1, PC2, label = quality)) +
geom_point(aes(color = quality), alpha = 0.7, size = 2) +
geom_text(check_overlap = TRUE, hjust = "inward") +
labs(color = NULL)

umap
library(embed)
umap_rec <- recipe(quality ~., data = winequality_red_tidy) %>%
step_normalize(all_predictors()) %>%
step_umap(all_predictors())
umap_prep <- prep(umap_rec)
umap_prep
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 10
##
## Training data contained 1599 data points and no missing data.
##
## Operations:
##
## Centering and scaling for fixed acidity, volatile acidity, ... [trained]
## UMAP embedding for fixed acidity, volatile acidity, ... [trained]
juice(umap_prep) %>%
ggplot(aes(umap_1, umap_2, label = quality)) +
geom_point(aes(color = quality), alpha = 0.7, size = 2) +
geom_text(check_overlap = TRUE, hjust = "inward") +
labs(color = NULL)

autoencoders
rw <- read_csv("winequality-red.csv") %>% mutate(quality = as.factor(quality))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## `fixed acidity` = col_double(),
## `volatile acidity` = col_double(),
## `citric acid` = col_double(),
## `residual sugar` = col_double(),
## chlorides = col_double(),
## `free sulfur dioxide` = col_double(),
## `total sulfur dioxide` = col_double(),
## density = col_double(),
## pH = col_double(),
## sulphates = col_double(),
## alcohol = col_double(),
## quality = col_double()
## )
set.seed(1353)
wine_split <- initial_split(rw)
train_data <- training(wine_split)
test_data <- testing(wine_split)
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 days 14 hours
## H2O cluster timezone: Europe/Berlin
## H2O data parsing timezone: UTC
## H2O cluster version: 3.32.0.1
## H2O cluster version age: 2 months and 6 days
## H2O cluster name: H2O_started_from_R_Herdt_iaq382
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.98 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.0.2 (2020-06-22)
ae_model <- h2o.deeplearning(x = 1:11, training_frame = as.h2o(train_data),
autoencoder = TRUE,
reproducible = TRUE,
seed = 148,
hidden = c(10,10,10), epochs = 100,activation = "Tanh",
validation_frame = as.h2o(test_data))
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
##
|
| | 0%
|
|======================================================================| 100%
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|= | 2%
|
|=================== | 27%
|
|===================================== | 53%
|
|======================================================= | 78%
|
|======================================================================| 100%
h2o.scoreHistory(ae_model)%>%head()
## Scoring History:
## timestamp duration training_speed epochs iterations samples
## 1 2020-12-15 08:07:47 0.033 sec 0,00000 obs/sec 0.00000 0 0.000000
## 2 2020-12-15 08:07:47 0.082 sec 27272 obs/sec 1.00000 1 1200.000000
## 3 2020-12-15 08:07:47 0.118 sec 32000 obs/sec 2.00000 2 2400.000000
## 4 2020-12-15 08:07:47 0.156 sec 33333 obs/sec 3.00000 3 3600.000000
## 5 2020-12-15 08:07:47 0.193 sec 34782 obs/sec 4.00000 4 4800.000000
## 6 2020-12-15 08:07:47 0.234 sec 34682 obs/sec 5.00000 5 6000.000000
## training_rmse training_mse validation_rmse validation_mse
## 1 0.16596 0.02754 0.16777 0.02815
## 2 0.07281 0.00530 0.07075 0.00501
## 3 0.06205 0.00385 0.06009 0.00361
## 4 0.05521 0.00305 0.05307 0.00282
## 5 0.05035 0.00254 0.04859 0.00236
## 6 0.04493 0.00202 0.04328 0.00187
test_autoencoder <- h2o.predict(ae_model, as.h2o(test_data))
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
train_features <- h2o.deepfeatures(ae_model, as.h2o(train_data), layer = 2) %>%
as.data.frame() %>%cbind(train_data %>%
mutate(quality_level = case_when(quality == 3 | quality == 4 ~ "low_quality"
,quality == 5 | quality == 6 ~ "medium_quality"
,quality == 7 | quality == 8 ~ "high_quality")) %>% select(quality_level))
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
ggplot(train_features, aes(x = DF.L2.C1, y = DF.L2.C4, color = quality_level)) +
geom_point(alpha = 0.9,size=1.5)+theme_bw()

train_features %>% plot_ly(x = .$DF.L2.C1, y = .$DF.L2.C2, z = .$DF.L2.C3, type="scatter3d", mode="markers", color = .$quality_level)
Anomaly detection
anomaly <- h2o.anomaly(ae_model, as.h2o(test_data)) %>%
as.data.frame() %>%
tibble::rownames_to_column() %>%cbind(test_data %>%
mutate(Class = case_when(quality == 3 | quality == 4 ~ "low_quality"
,quality == 5 | quality == 6 ~ "medium_quality"
,quality == 7 | quality == 8 ~ "high_quality")) %>% select(Class))
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
##
|
| | 0%
|
|======================================================================| 100%
mean_mse <- anomaly %>%
group_by(Class) %>%
summarise(mean = mean(Reconstruction.MSE))
## `summarise()` ungrouping output (override with `.groups` argument)
anomaly<-anomaly%>%mutate_if(is.character,as.factor)
anomaly$rowname=as.numeric(anomaly$rowname)
anomaly%>%head()
## rowname Reconstruction.MSE Class
## 1 1 3.100766e-05 medium_quality
## 2 112 8.333021e-05 medium_quality
## 3 223 9.802102e-05 medium_quality
## 4 334 1.859427e-05 medium_quality
## 5 345 1.329144e-04 medium_quality
## 6 356 5.681355e-04 medium_quality
wine.anon = h2o.anomaly(ae_model, as.h2o(test_data), per_feature=FALSE)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
##
|
| | 0%
|
|======================================================================| 100%
MSE<-wine.anon%>%as_tibble()
MSE$Index<-1:length(MSE$Reconstruction.MSE)
ggplot(MSE,aes(x=Index,y=sort(Reconstruction.MSE)))+geom_point()+ylab("Reconstruction.MSE")

anomaly%>%ggplot( aes(x = rowname, y = Reconstruction.MSE,color=Class)) +
geom_point(alpha = 0.5) +
#geom_hline(data = mean_mse, aes(yintercept = mean)) +
geom_hline(yintercept =0.004,color="#3288BD") +
labs(x = "instance number", color = "Class")

anomaly <- anomaly %>%
mutate(outlier = ifelse(Reconstruction.MSE > 0.004 , "outlier", "no_outlier"))
anomaly %>%
group_by(Class, outlier) %>%
dplyr:: summarise(n = n()) %>%
mutate(freq = n / sum(n))
## `summarise()` regrouping output by 'Class' (override with `.groups` argument)
## # A tibble: 5 x 4
## # Groups: Class [3]
## Class outlier n freq
## <fct> <chr> <int> <dbl>
## 1 high_quality no_outlier 44 1
## 2 low_quality no_outlier 14 0.933
## 3 low_quality outlier 1 0.0667
## 4 medium_quality no_outlier 338 0.994
## 5 medium_quality outlier 2 0.00588